#############################################
# File to put together data set and run     #
# principal component analysis for:         #
# "The Arc of Modernization: Economic       #
# Structure, Materialism, and the           #
# Onset of Civil Conflict"                  #
#                                           #
# J. Tyson Chatagnier and Emanuele Castelli #
#                                           #
# Forthcoming in Political Science Research #
# and Methods                               #
#                                           #
# File created 31 March 2016                #
#############################################

rm(list = ls())

library(foreign)
library(countrycode)
library(reshape)
library(ggplot2)

##########################################
# Necessary functions for data cleaning, #
# interpolation, PCA, and plotting       #
##########################################

LinInterp <- function(data,
	                    obs,
	                    var
	                    ) {
	ccode <- data$country[obs]
	year <- data$year[obs]
  if (is.na(data[obs, var])) {
  	full.data <- na.omit(data[data$country == ccode, ])
  	                    # Gets the full regular data set for a given country.
  	if (any(full.data$year < year) & any(full.data$year > year)) {
  		x <- year
  		x0 <- full.data$year[max(which(full.data$year < year))]
  	  x1 <- full.data$year[min(which(full.data$year > year))]
  	  y0 <- full.data[full.data$year == x0, var]
  	  y1 <- full.data[full.data$year == x1, var]
      y <- y0 + (y1 - y0) * ((x - x0) / (x1 - x0))
  	} else {
  		y <- NA
  	}
  } else {
  	y <- data[obs, var]
  }
  return(y)
}

SplineInterp <- function(years.vec,
	                       vals.vec
	                       ) {
	require(splines)
	yrs.nona <- years.vec[!is.na(vals.vec)]
	vals.nona <- na.omit(vals.vec)
	int <- interpSpline(vals.nona ~ yrs.nona)
	vec.interp <- predict(int,
		                    years.vec
		                    )$y
	return(vec.interp)
}
                        # SplineInterp() can be used to verify that
                        # interpolation with splines is very
                        # similar to linear interpolation.

PctChange <- function(x,
                      ccodes
                      ) {
  x <- ifelse(x <= 0, 
              NA, 
              x
              )
                        # x must be strictly positive
  x.split <- split(x,
                   ccodes
                   )
  x.change <- lapply(x.split, 
                     function (x) {
                      c(diff(x),
                        NA
                        ) / x
                     }
                     )
  change <- do.call(c,
                    x.change
                    )
  return(change)
}

PCbiplot <- function(PC, 
	                   x           = "PC1", 
                     y           = "PC2",
                     obs.names   = NULL,
                     dim.1       = "Dimension 1",
                     dim.2       = "Dimension 2",
                     hilight.sub = NULL
                     ) {
	                      # Code to create the biplot from PCA
                        # is adapted from a response on stackoverflow:
	                      # http://stackoverflow.com/questions/6578355
	require(ggplot2)
	require(grid)
	if (is.null(obs.names)) {
		obs.names <- rownames(PC$x)
	}
  data <- data.frame(obsnames = obs.names, 
  	                 PC$x
  	                 )
  plot <- ggplot(data, 
  	             aes_string(x = x, 
  	             	          y = y)
  	                        ) + 
                   geom_text(alpha =.4, 
                   	         size  = 3, 
                   	         aes(label = obsnames)
                   	         ) + 
                   geom_hline(yintercept = 0, 
  	                          size       = .2
  	                          ) + 
                   geom_vline(xintercept = 0, 
                   	          size       = .2
                   	          )
  if (!is.null(hilight.sub)) {
    plot <- plot + geom_text(data   = data[hilight.sub, ],
                        # This allows us to highlight particular values
                        # in the data set, if desired.
                             size   = 3, 
                             aes(label = obsnames),
                             colour = "lightblue"
                             )
  }
  datapc <- data.frame(varnames = rownames(PC$rotation), 
  	                   PC$rotation
  	                   )
  mult <- min(
      (max(data[,y]) - min(data[,y]) / (max(datapc[,y]) - min(datapc[,y]))),
      (max(data[,x]) - min(data[,x]) / (max(datapc[,x]) - min(datapc[,x])))
      )
  datapc <- transform(datapc,
                      v1 = .7 * mult * (get(x)),
                      v2 = .7 * mult * (get(y))
                      )
  plot <- plot + 
          coord_equal() +
          geom_text(data = datapc, 
          	        aes(x     = v1, 
          	        	  y     = v2, 
          	        	  label = varnames
          	        	  ), 
          	        size  = 5, 
          	        vjust = 1, 
          	        color = "red"
          	        ) +
          geom_segment(data = datapc,
          	           aes(x    = 0, 
          	           	   y    = 0, 
          	           	   xend = v1, 
          	           	   yend = v2
          	           	   ), 
          	           arrow = arrow(length = unit(0.2,
          	           	                           "cm"
          	           	                           )
          	                         ), 
          	           alpha = 0.75, 
          	           color ="red"
          	           ) +
          xlab(dim.1) +
          ylab(dim.2) +
          theme_bw()
  plot
}

PlotSub <- function(PC, 
                    x           = "PC1", 
                    y           = "PC2",
                    obs.names   = NULL,
                    dim.1       = "Dimension 1",
                    dim.2       = "Dimension 2",
                    sub.plot    = NULL
                    ) {
  require(ggplot2)
  require(grid)
  if (is.null(obs.names)) {
    obs.names <- rownames(PC$x)
  }
  data <- data.frame(obsnames = obs.names, 
                     PC$x
                     )
  data <- data[sub.plot, ]
  plot <- ggplot(data, 
                 aes_string(x = x, 
                            y = y
                            )
                 ) + 
                   geom_text(size  = 3, 
                             aes(label = obsnames)
                             ) + 
                   geom_hline(yintercept = 0, 
                              size       = .2
                              ) + 
                   geom_vline(xintercept = 0, 
                              size       = .2
                              ) +
                   xlab(dim.1) +
                   ylab(dim.2) +
                   theme_bw()

  plot
}

GenLag <- function(x,
                   ccodes,
                   years,
                   lags = 1
                   ) {
                        # This is a function to
                        # generate a lagged variable for
                        # pooled data.
                        # x is the variable we want to lag
                        # ccodes is a vector of country codes
                        # years is a vector of years
                        # lags is the number of lags we want to use
                        # NOTE: lags can be negative if we want
                        # to generate a leading variable.
  require(plyr)
  lags.fn <<- lags
                        # ddply will evaluate global variables
                        # only so we need to use a global assignment
  df <- data.frame(x     = x,
                   ccode = ccodes,
                   year  = years
                   )
  if (lags > 0) {
    x.lag <- ddply(df, 
                   .(ccode),
                   transform,
                   x = c(rep(NA,
                             times = lags.fn
                             ), 
                         x[-((length(x) - lags.fn + 1) : length(x))])
                   )
  } else {
    x.lag <- ddply(df, 
                   .(ccode),
                   transform,
                   x = c(x[-(1 : abs(lags.fn))],
                         rep(NA,
                             times = abs(lags.fn)
                             )
                         )
                   )
  }
  rm(lags.fn,
     pos = ".GlobalEnv"
     )
                        # Remove the global variable we created.
  return(x.lag$x)
}

################
# Prepare data #
################

wars <- read.dta("koubi_boehmelt_jpr_replication.dta")
                        # Civil War data come from Koubi and Boehmelt's 
                        # 2014 replication data for JPR article

states <- read.dta("state_data.dta")
                        # State-level data on Polity scores, GDP, 
                        # urbanization, and sector shares of GDP

cw <- merge(states,
            wars,
            by    = c("ccode",
                      "year"
                      ),
            all.x = TRUE
            )

##########################
# Create other important #
# variables              #
##########################

cw$p2 <- cw$polity ^ 2

cw$lngdp <- log(cw$gdp + 1)

cw$gdp.growth <- PctChange(cw$gdp,
                           cw$ccode
                           )

####################
# Add in education #
####################

ed <- read.dta("fem_pop_25_over.dta")
                        # Read in the female education data, taken
                        # from the World Bank.

ed.att <- data.frame(country = ed$country,
	                   year    = ed$year,
	                   educ    = ed$ls + ed$lh
	                   )
                        # Create a variable for % of females 25 or older
                        # who have attained some secondary schooling or
                        # higher.

ed.cy <- expand.grid(country = unique(ed$country),
	                   year    = seq(from = 1950,
	                   	             to   = 2010
	                   	             )
	                   )

ed.new <- merge(ed.cy,
	              ed.att,
	              all.x = TRUE
	              )
                        # Educational data is given in five-year increments;
                        # create a data frame with country-year observations

ccodes <- countrycode(ed.new$country,
                      "country.name",
                      "cown"
                      )
                        # Get COW country codes from state names

ed.new$educ <- sapply(seq(from = 1,
	                        to   = nrow(ed.new)
	                        ),
                      LinInterp,
                      data = ed.new,
                      var  = "educ"
                      )
                        # Perform linear interpolation on the education data

ed.new$ccode <- ccodes

data <- merge(cw,
	            ed.new,
	            by = c("ccode",
	            	     "year"
	            	     )
	            )
                        # Merge interpolated education data with civil war data

####################################
# Run principal component analysis #
####################################

mod.data <- na.omit(data.frame(country      = countrycode(data$ccode,
                                                          "cown",
                                                          "cowc"
                                                          ),
                               ccode        = data$ccode,
	                             year         = data$year,
	                             Industry     = data$industry,
	                             Service      = data$service,
	                             Agriculture  = data$agriculture,
	                             Urbanization = data$urb,
	                             Education    = data$educ
	                             )
                    )

mod.pca <- prcomp(mod.data[, -c(1 : 3)])

summary(mod.pca)
                        # Get summary of the PCA object, which includes
                        # variance explained for each of the dimensions

################################
# Plot results of PCA. This is #
# Figure 2 in the paper        #
################################

pdf("pca_plot.pdf")
  PCbiplot(mod.pca,
           obs.names = paste(mod.data$country,
                             mod.data$year,
                             sep = " "
                             ),
           dim.1     = "Modernization",
           dim.2     = "Material Culture"
           )
dev.off()

##################################
# Get country-level results for  #
# the two dimensions revealed by #
# principal component analysis.  #
# This is Figure 3 in the paper  #
##################################

data.rok <- which(substr(mod.data$country,
                         1,
                         3
                         ) == "ROK"
                  )
pdf("pca_plot_korea.pdf")
  PlotSub(mod.pca,
         obs.names = paste(mod.data$country,
                           mod.data$year,
                           sep = " "
                           ),
         dim.1     = "Modernization",
         dim.2     = "Material Culture",
         sub.plot  = data.rok
         ) +
  ggtitle("Republic of Korea, 1965-1999")
dev.off()

data.bra <- which(substr(mod.data$country,
                         1,
                         3
                         ) == "BRA"
                  )
pdf("pca_plot_brazil.pdf")
  PlotSub(mod.pca,
         obs.names = paste(mod.data$country,
                           mod.data$year,
                           sep = " "
                           ),
         dim.1     = "Modernization",
         dim.2     = "Material Culture",
         sub.plot  = data.bra
         ) + 
  ggtitle("Brazil, 1960-1999")
dev.off()

data.ukg <- which(substr(mod.data$country,
                         1,
                         3
                         ) == "UKG"
                  )
 pdf("pca_plot_uk.pdf")
  PlotSub(mod.pca,
         obs.names = paste(mod.data$country,
                           mod.data$year,
                           sep = " "
                           ),
         dim.1     = "Modernization",
         dim.2     = "Material Culture",
         sub.plot  = data.ukg
         ) +
  ggtitle("United Kingdom, 1970-1999")
dev.off()

data.mli <- which(substr(mod.data$country,
                         1,
                         3
                         ) == "MLI"
                  )
 pdf("pca_plot_mali.pdf")
  PlotSub(mod.pca,
         obs.names = paste(mod.data$country,
                           mod.data$year,
                           sep = " "
                           ),
         dim.1     = "Modernization",
         dim.2     = "Material Culture",
         sub.plot  = data.mli
         ) +
  ggtitle("Mali, 1967-1999")
dev.off()

###########################
# Plot results for both   #
# Mexico and Chile. These #
# are the right panels in #
# Figures 5 and 6         #
###########################

data.mex <- which(substr(mod.data$country,
                            1,
                            3
                            ) == "MEX"
                  )

 pdf("pca_plot_mexico.pdf")
  PlotSub(mod.pca,
         obs.names = paste(mod.data$country,
                           mod.data$year,
                           sep = " "
                           ),
         dim.1     = "Modernization",
         dim.2     = "Material Culture",
         sub.plot  = data.mex
        ) 
dev.off()

data.chl <- which(substr(mod.data$country,
                            1,
                            3
                            ) == "CHL"
                  )

 pdf("pca_plot_chile.pdf")
  PlotSub(mod.pca,
         obs.names = paste(mod.data$country,
                           mod.data$year,
                           sep = " "
                           ),
         dim.1     = "Modernization",
         dim.2     = "Material Culture",
         sub.plot  = data.chl
        ) 
dev.off()

#############################
# Add PCA variables to data #
#############################

mod.data$pca.mod <- mod.pca$x[, 1]
mod.data$pca.mat <- mod.pca$x[, 2]

data.new <- merge(data,
                  mod.data,
                  by = c("ccode",
                         "year"
                         ),
                  all.x = TRUE
                  )

data.new$country.x <- as.factor(data.new$country.x)
                        # R cannot write the data frame to .dta format
                        # unless this column is declared to be a factor

######################################
# Create necessary variables, lag    #
# time-varying components, and write #
# to a .dta file                     #
######################################

data.new$sixties <- ifelse(data.new$year >= 1960 & data.new$year < 1970,
                     1,
                     0
                     )

data.new$seventies <- ifelse(data.new$year >= 1970 & data.new$year < 1980,
                       1,
                       0
                       )

data.new$eighties <- ifelse(data.new$year >= 1980 & data.new$year < 1990,
                      1,
                      0
                      )

############################
# Create data set with the #
# appropriate variables    #
############################

mod.df <- data.frame(gdp       = data.new$lngdp, 
                     oil       = data.new$Oil, 
                     growth    = data.new$gdp.growth, 
                     polity    = data.new$polity, 
                     p2        = data.new$p2, 
                     pop       = data.new$lpop, 
                     ef        = data.new$ethfrac, 
                     mtn       = data.new$lmtnest, 
                     parity    = data.new$power_parity_pop,
                     mod       = data.new$pca.mod,
                     mat       = data.new$pca.mat, 
                     sixties   = data.new$sixties,
                     seventies = data.new$seventies,
                     eighties  = data.new$eighties,
                     onset     = data.new$onset_new,
                     ccode     = data.new$ccode,
                     year      = data.new$year
                     )

##############################
# Lag the relevant variables #
##############################

lag.df <- as.data.frame(apply(mod.df[, !(names(mod.df) %in% c("mtn",
                                                              "sixties",
                                                              "seventies",
                                                              "eighties",
                                                              "onset",
                                                              "ccode",
                                                              "year"
                                                              )
                                         )
                                     ],
                              2,
                              GenLag,
                              ccodes = mod.df$ccode, 
                              years  = mod.df$year
                              )
                        )

################################
# Add the non-lagged variables #
# back into the data set       #
################################

lag.df$mtn <- mod.df$mtn
lag.df$sixties <- mod.df$sixties
lag.df$seventies <- mod.df$seventies
lag.df$eighties <- mod.df$eighties
lag.df$ccode <- mod.df$ccode
lag.df$year <- mod.df$year
lag.df$onset <- mod.df$onset

###########################
# Write data for analysis #
###########################

write.dta(lag.df,
          "cw_replication_data.dta"
          )
                        # Write data for SPPO analysis in Stata

